set.seed(1)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander", "wordcloud2", "stopwords", "magicfor", "gapminder")
for(i in required_packages) {
if(!require(i, character.only = T)) {
# if package is not existing, install then load the package
install.packages(i, dependencies = T)
require(i, character.only = T)
}
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 20,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for charts without y axis
theme_blank <-
theme(panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
axis.line.y = element_blank(),
axis.text.y = element_blank())
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")overview <- read_csv("../proc_data/overview.csv")
con <- dbConnect(
bigrquery::bigquery(),
project = "dataforseo-bigquery",
billing = "dataforseo-bigquery"
)pservices <- read_csv("../raw_data/productsservices.csv") %>%
clean_names() %>% rename(c1 = criterion_id) %>% select(-category) %>%
separate(c1, sep =",\"", into = c("id", "category")) %>%
mutate(category = substr(category, 2, nchar(category) -1)) %>%
separate(category, sep = "/", into = c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8"))
toplevel <- pservices %>% filter(is.na(cat2))write_categories <- function()
{
get_category_volume <- function(id){
sql <- glue(
"SELECT Avg(keyword_info_search_volume) AS `search_volume`, COUNT(*) AS `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND spell = ''
AND keyword_info_categories like '%{id}%' ")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(id = id)
}
df <- map_df(toplevel$id, get_category_volume)
df %>% mutate(search_volume = search_volume,
mean_volume = search_volume / count
) %>%
left_join(toplevel %>% select(id, cat1), by = "id") %>%
write_csv("../proc_data/categories_averages.csv")
}
#write_categories()
df <- read_csv("../proc_data/categories_averages.csv")df %>%
ggplot(aes(x = fct_reorder(cat1, search_volume), y = search_volume)) +
geom_bar(stat = "identity", fill = bl_dark, width = 0.7) +
coord_flip() +
theme_flip +
theme(panel.grid.major.y = element_blank(), axis.line.y = element_blank()) +
scale_y_continuous(limits = c(0, 3500), expand = c(0,0), breaks = seq(0, 3000, by = 1000)) +
labs(x = NULL, y = NULL, title = "Search volume mean by category")+
ggsave(here::here("plots", "reworked", "volume_category_ordered.pdf"),
width = 10, height = 7.5, device = cairo_pdf)!!!D: I would be curious to see the same graph with median. Wouldn´t it be better to use median given the skewed data set?
!!!J: I am skeptical for using the median, since it is brought far down by a large number of searches with low volume, even when we exclude 0 volume searches. Remember, the median cpc overall was 0. But you are right that it is an issue with the skewed data set.
!!!D: Maybe a boxplot or something similiar would make more sense here? Or something like that: https://tinyurl.com/yxfwo5vt Leave that out if it´s too complicated to implement.
write_categories_volume <- function()
{
get_category_volume <- function(id){
sql <- glue(
"SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) as volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND spell = ''
AND keyword_info_search_volume > 0
AND keyword_info_categories like '%{id}%' ")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(id = id)
}
df <- map_df(toplevel$id, get_category_volume) %>%
mutate(volume = volume * 10000)
df %>% left_join(toplevel %>% select(id, cat1), by = "id") %>%
write_csv("../proc_data/categories_total_volume.csv")
}
write_categories_volume()
df <- read_csv("../proc_data/categories_total_volume.csv")df %>%
mutate(volume = volume / sum(volume)) %>%
ggplot(aes(x = fct_reorder(cat1, volume), y = volume)) +
geom_bar(stat = "identity", fill = bl_dark, width = 0.7) +
geom_text(aes(label = glue::glue("{format(round(volume, 3) * 100, scientific = FALSE)}%")),
nudge_y = .002, family = "Montserrat", fontface = "bold",
color = "grey40", size = 2.7, hjust = 0) +
coord_flip() +
theme_flip +
theme(panel.grid.major.y = element_blank(), axis.line.y = element_blank()) +
scale_y_continuous(limits = c(0, .21), expand = c(0,0), labels = scales::percent) +
labs(x = NULL, y = NULL, title = "Industries With the Greatest Total Search\nVolume Include “News, Media & Publications”,\n“Internet & Telecom”, and “Arts & Entertainment”") +
ggsave(here::here("plots", "reworked", "volume_total_category_ordered.pdf"),
width = 10, height = 7.5, device = cairo_pdf)write_categories <- function()
{
get_category <- function(id){
sql <- glue(
"SELECT AVG(keyword_info_cpc) as mean_cpc
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND spell = ''
AND keyword_info_search_volume > 0
AND keyword_info_categories like '%{id}%' ")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(id = id)
}
df <- map_df(toplevel$id, get_category)
df %>% left_join(toplevel %>% select(id, cat1), by = "id") %>%
write_csv("../proc_data/categories_cpc.csv")
}
#write_categories()
df <- read_csv("../proc_data/categories_cpc.csv")df %>%
ggplot(aes(x = fct_reorder(cat1, mean_cpc), y = mean_cpc)) +
geom_bar(stat = "identity", fill = bl_dark, width = 0.7) +
geom_text(aes(label = format(round(mean_cpc, 2), scientific = FALSE)),
nudge_y = -.02, family = "Montserrat", hjust = 1,
size = 2.5, color = "white", fontface = "bold") +
coord_flip() +
theme_flip +
theme(panel.grid.major.y = element_blank(), axis.line.y = element_blank()) +
scale_y_continuous(expand = c(0,0), breaks = seq(0, 2, by = .25)) +
labs(x = NULL, y = NULL, title = "Finance, Real Estate, and Health Industries Have the Highest CPCs") +
ggsave(here::here("plots", "reworked", "cpc_mean_category_ordered.pdf"),
width = 10, height = 7, device = cairo_pdf)